
* Start from an empty session
clear all

* Fix the random-number seed so the runiform() draws further down are reproducible
set seed 20150605

* Project directories (raw inputs and derived outputs)
global Input  "Y:/limited/Michigan_CTE/funding_change/data_raw"
global Output "Y:/limited/Michigan_CTE/funding_change/data_derived"

* Build the course lookup (one row per courseid and year with cipcode, obno, psn).
* The 2019 import is cleaned first and then appended to the archived 2007-2018
* course file. The result is stored in the tempfile named courses.

* Clean 2019 Data to append
use "O:\data\base_cte\data_temp\archived\2007-2019\course\course_2019_imported.dta", clear


*** Generate year
gen year = 2019


*** Clean CIPCODES
* Normalise to the "NN.NNNN" layout: add the decimal point if absent, right-pad
* with zeros, then left-pad a six-character code with "0".
* Blank codes are left blank on purpose. Without the cipcode != "" guard a blank
* would become ".", which is a non-missing string, so the later merge with the
* update option and the later fill that tests cipcode == "" could never repair it.
* NOTE(review): the padding is length-based, so a value such as "1.05" ends up as
* "1.05000" rather than "01.0500" - confirm no such values occur in the import.
replace cipcode = cipcode + "." if !regexm(cipcode, "\.") & cipcode != ""
replace cipcode = cipcode + "0000" if strlen(cipcode) == 2 | strlen(cipcode) == 3
replace cipcode = cipcode + "000" if strlen(cipcode) == 4
replace cipcode = cipcode + "00" if strlen(cipcode) == 5
replace cipcode = "0" + cipcode if strlen(cipcode) == 6


*** get missing obnos:

* List all codes based on names (missing in master data)
* The 2019 data in memory is set aside while a name-to-code crosswalk is built
* from the program enrollment file.
preserve
use "O:/data_final/epi_cte/cte_program_enrollment.dta", clear
* Numeric missing compares greater than any number, so rows with a missing
* year are dropped here as well.
drop if year>2020
keep obno oano year
duplicates drop
* decode writes the value labels of obno and oano into string variables
decode obno, gen(obname)
decode oano, gen(oaname)
drop if obname==""
tempfile names
save `names'
restore

* Update all with school name matches
* update only fills values that are missing in the master data; keep(1 3 4 5)
* discards rows that exist only in the crosswalk.
merge m:1 obname oaname year using `names', update keep(1 3 4 5)

* Replace any with some within-school issues (based on year)
* Numeric missing sorts last, so obno[1] is the smallest non-missing obno in
* each oaname/obname group (or missing if the whole group is missing).
bys oaname obname (obno): gen obno2 = obno[1]
* Every named building must map to a single obno
assert obno==obno2 if obname!="" & obno!=.
* NOTE(review): this fill also applies to rows with a blank obname, which the
* assert above does not cover - confirm that is intended.
replace obno=obno2 if obno==.

*** get missing psns

* Replace any with some within-school issues
* psn[1] is the smallest non-missing psn within each obno/cipcode group
bys obno cipcode (psn): gen psn1 = psn[1]
* Diagnostic only: number of rows whose psn already equals the group value
* NOTE(review): possibly meant to count conflicts (psn != psn1) - confirm.
count if  psn==psn1 & psn!=. & !inlist(obno,0,.)
replace psn=psn1 if psn==. & !inlist(obno,0,.)

* Helper variables (obno2, psn1, _merge, names) are discarded here
keep courseid cipcode year obno psn


tempfile courses
save `courses'

* Get 2007-2019 course info
* Archived 2007-2018 file plus the cleaned 2019 rows saved just above
use courseid year psn cipcode obno using "O:\data\base_cte\data_final\archived\2007-2018\cte_courses.dta", clear
append using `courses'
* Check of the year coverage after appending
tab year
save `courses', replace

* Build the student-by-course file: load enrollments, fill cipcode/psn/obno from
* the course lookup, derive course length and courses per year, and save.

* Open new course files
use ric year course_id cipcode psn course_begin_date course_end_date using "O:/data/base_cte/data_final/cte_course_enrollment.dta", clear
gduplicates drop
ren course_id courseid

* Merge in Missing CIP and PSN (2007-2019)
* update only fills values that are missing in the enrollment data; rows that
* exist only in the lookup are discarded by keep(1 3 4 5).
merge m:1 courseid year using `courses', update keep(1 3 4 5)
drop _merge

* Get Start and End to think about Courses
gen length_month = month(course_end_date)-month(course_begin_date) + 12*(year(course_end_date)-year(course_begin_date))
gen length_days = round(course_end_date - course_begin_date)

* Courses per year implied by the course length in days (left missing for 2019):
*   more than 250 days -> 1, fewer than 125 -> 3, fewer than 63 -> 4, otherwise 2.
* Numeric missing compares greater than any number in Stata, so the > 250 test
* needs an explicit missing guard; without it every course with an unknown
* length would be classified as year-long instead of keeping the default of 2.
gen course_per_year = 2 if year!=2019
replace course_per_year = 1 if length_days>250 & !missing(length_days) & year!=2019
replace course_per_year = 3 if length_days<125 & year!=2019
replace course_per_year = 4 if length_days<63 & year!=2019


*Clean up
* Fill gaps within a course from its other rows. Numeric missing sorts last, so
* [1] is the smallest non-missing value; the empty string sorts first, so [_N]
* is the last cipcode in sort order.
bys courseid (obno): gen o1 = obno[1]
replace obno = o1 if obno==.
bys courseid (cipcode): gen c1 = cipcode[_N]
replace cipcode = c1 if cipcode==""
bys courseid (psn): gen p1 = psn[1]
replace psn = p1 if psn==.

* Get most recent course per year impute to weird years
* hasc is 0 only where course_per_year is missing; sorting on -hasc -year puts
* the most recent row with a value first within each course.
* NOTE(review): this relies on the gsort order still being in place when the
* bys prefix below runs - confirm the data stays marked as sorted by courseid.
gen hasc = course_per_year!=.
gsort courseid -hasc -year
bys courseid: gen y1 = course_per_year[1]
replace course_per_year = y1 if inlist(year,2007,2008,2009,2019)

drop o1 c1 p1 y1  hasc


*Cooking gets relabeled
replace cipcode = "12.0500" if cipcode == "12.9999"

* Some cips not federally recognized
drop if inlist(cipcode,"19.0000","BT.0000","EM.0000", "HE.0000", "HU.0000", "NR.0000")


ren ric student

save "${Output}/student_course_data.dta", replace





* Approximate course_per_year where it is still missing (psns only in 2019).
* Each course takes the uniform draw of its first row and is assigned
*   2 when the draw is below .75, 3 when it is in [.75, .95), 4 otherwise.
generate u_row = runiform()
bysort courseid: generate u_course = u_row[1]
replace course_per_year = cond(u_course < .75, 2, cond(u_course < .95, 3, 4)) if course_per_year == .

drop u_row u_course

** Count courses and semesters
* course_count: number of rows per psn/student/cipcode
* cpy: modal course_per_year in that group, smallest value when modes tie
bysort psn student cipcode: generate course_count = _N
bysort psn student cipcode: egen cpy = mode(course_per_year), min

* Semesters step up by one each time course_count/cpy passes .5, 1, 1.5, 2, 2.5
generate total_semesters = 1 + (course_count/cpy > .5) + (course_count/cpy > 1) + (course_count/cpy > 1.5) + (course_count/cpy > 2) + (course_count/cpy > 2.5)

* One row per student, psn and cipcode
keep student psn cipcode course_count total_semesters
gduplicates drop

save ${Output}/course_count.dta, replace



